from numpy import *
import pandas as pd

from sklearn.decomposition import PCA


def pca_relevance(dataMat, topNfeat=2):
    """Project dataMat onto its top principal components.

    Args:
        dataMat: (n_samples, n_features) numeric matrix/array; features
            are columns.
        topNfeat: number of principal components to keep. If it exceeds
            the feature count, all components are kept (the slice below
            simply returns everything).

    Returns:
        numpy matrix of shape (n_samples, topNfeat): the mean-centered
        data expressed in the principal-component basis.
    """
    meanVals = mean(dataMat, axis=0)           # per-feature (column) means
    meanRemoved = dataMat - meanVals           # center each feature at zero
    covMat = cov(meanRemoved, rowvar=False)    # columns are the variables
    # eigh, not eig: the covariance matrix is symmetric, so eigh is the
    # numerically stable choice and guarantees real eigenvalues (eig can
    # leak tiny imaginary parts, which breaks argsort on the values).
    # asmatrix replaces the legacy mat() alias (removed in NumPy 2.x).
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    eigValInd = argsort(eigVals)               # indices, ascending eigenvalue
    # Walk the sorted indices backwards to take the topNfeat largest;
    # if topNfeat >= n_features the slice yields every component.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]       # principal directions as columns
    lowDDataMat = meanRemoved * redEigVects    # matrix product: change of basis
    return lowDDataMat


def pca_distribution(data=None):
    """Reduce *data* with PCA, letting MLE choose the component count.

    Args:
        data: (n_samples, n_features) array to decompose. Defaults to the
            module-level global ``X2`` so the original parameterless call
            keeps working.

    Returns:
        ndarray of shape (n_samples, n_components_): the data projected
        onto the components selected by Minka's MLE.

    Side effects:
        Prints the per-component explained-variance ratios, the chosen
        component count, and the transformed data.
    """
    if data is None:
        data = X2  # legacy behavior: operate on the script-level global
    # n_components='mle' lets PCA pick the number of components via MLE
    # from the variance structure of the features.
    pca = PCA(n_components='mle')
    # Fit exactly once. The original called fit() and then fit_transform()
    # twice, refitting the same model three times on identical data.
    transformed = pca.fit_transform(data)
    # explained_variance_ratio_: fraction of total variance carried by
    # each retained component.
    print(pca.explained_variance_ratio_)
    print(pca.n_components_)  # component count selected by MLE
    print(transformed)
    return transformed


if __name__ == '__main__':
    # Hard-coded location of the aggregated benchmark statistics.
    csv_path = r'C:\Users\13780\Desktop\Mulangit\visual-recommendation-2\KaggleBench\benchmark_manager\all_data.csv'
    frame = pd.read_csv(csv_path)

    # Columns 1:4 hold the pearson / spearman / kendall correlations.
    X = mat(frame.iloc[:, 1:4])
    # Impute every NaN with the mean of the non-NaN entries.
    X[isnan(X)] = mean(X[~isnan(X)])
    # Columns 4:9 feed the distribution PCA. NOTE: pca_distribution()
    # reads this module-level global, so the name X2 must stay.
    X2 = mat(frame.iloc[:, 4:9])

    rel = pca_relevance(X)
    print("rel", type(rel))
    dis = asmatrix(pca_distribution())
    print("dis", type(dis))

    # "wide" part: the leading label column followed by columns 9:12.
    L = mat(frame.iloc[:, 0:1])
    Y = mat(frame.iloc[:, 9:12])
    wide = hstack((L, Y))
    # Deep PCA features first, then the wide block, written as CSV.
    combined = hstack((hstack((rel, dis)), wide))
    savetxt('svmData.csv', combined, delimiter=',')

